library(here)
library(tidyverse)
library(lubridate)
library(countrycode) # to code region below
library(jsonlite)
library(janitor)
library(stringi)

# Set the knitr chunk option `echo` to TRUE as the default for all chunks
knitr::opts_chunk$set(echo = TRUE)
# Column predicate used to filter out columns whose rows are all NA.
# Despite its name, it returns TRUE when `x` holds at least one non-missing
# value and FALSE when every value is NA (or `x` is empty).
all_na <- function(x) {
  !all(is.na(x))
}


## ###################################################
## # Import People file from EM
## ###################################################
## # Download from: Reports - Custom Report - People and Address - 'Download the data file in Unicode format - UTF-8'
## people <- read_delim(here("data", "PEOPLEADDRESS_UNICODE.tab"),
##                      col_types = cols_only("People Unique ID"  = col_double(),
##                       "First Name" = col_character(),
##                       "Middle Name" = col_character(),
##                       "Last Name" = col_character(),
##                       "Title" = col_character(),
##                       "Degree" = col_character(),
##                       "Position" = col_character(),
##                       "Department" = col_character(),
##                       "Institution" = col_character(),
##                       "City" = col_character(),
##                       "State" = col_character(),
##                       "Zip" = col_character(),
##                       "Country" = col_character(),
##                       "E-mail Address" = col_character(),
##                       "ORCID" = col_character(),
##                       "People Record Last Update Date" = col_character()),
##                      delim = "\t", quote = "", locale = default_locale(),
##                      guess_max = 50000)
## 
## # clean up names
## people = people %>%
##   clean_names() # janitor tool that auto-cleans column names


## # Generate a function that cleans the names - From Thomas's code
## clean.names <- function(x, delete.missing=FALSE){
##   str_sub(x,1,1) <- toupper(str_sub(x,1,1)) # make sure first character is capitalized
##   names <- str_extract(x, "[[:upper:]]{1}([[:lower:]]|[[:upper:]])+") # extract the first part of the names
##   names <- stri_trans_general(names, "latin-ascii") # clean special characters
##   if(delete.missing==TRUE){names <- names[is.na(names)==FALSE]} # delete missings
##   return(names)
## }


## # Use genderize a simple API to determine the gender of a name
## ##############################################################
## 
## # Select names and pass through the name cleaning function
## people = people %>%
##   mutate(firstname_clean = clean.names(first_name), #pass names through cleaning function
##          middlename_clean = clean.names(middle_name),
##          country_iso2c = countryname(country, destination = "iso2c"),
##          country_iso2c = if_else(country == "Yugoslavia", "YU", country_iso2c),
##          url1 = str_c("https://api.genderize.io/?name=",firstname_clean, #generate urls with country
##                       "&country_id=",country_iso2c,"&apikey=ENTERYOURAPIKEYHERE", sep=""),
##          url2 = str_c("https://api.genderize.io/?name=",middlename_clean,
##                       "&country_id=",country_iso2c,"&apikey=ENTERYOURAPIKEYHERE", sep=""),
##          url3 = str_c("https://api.genderize.io/?name=",firstname_clean,
##                       "&apikey=ENTERYOURAPIKEYHERE", sep=""),
##          url4 = str_c("https://api.genderize.io/?name=",middlename_clean,
##                       "&apikey=ENTERYOURAPIKEYHERE", sep=""),
##          gender1 = NA,
##          pr1 = NA,
##          gender2 = NA,
##          pr2 = NA,
##          gender3 = NA,
##          pr3 = NA,
##          gender4 = NA,
##          pr4 = NA)


## ## Divide Dataset into quarters
## people_q1 <- people %>%
##   slice(1:10000)
## 
## people_q2 <- people %>%
##   slice(10001:20000)
## 
## people_q3 <- people %>%
##   slice(20001:30000)
## 
## people_q4 <- people %>%
##   slice(30001:nrow(people))


## # Use genderize a simple API to determine the gender of a name
## ##############################################################
## 
## # First Quarter
## # Gender 1
## for(i in 1:nrow(people_q1)){
##     if(is.na(people_q1$url1[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q1$url1[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q1$gender1[i] = genderize$gender
## 		  people_q1$pr1[i] = genderize$probability
##     }
##   }
##  }
## 
## # Gender 2
##  for(i in 1:nrow(people_q1)){
##     if(is.na(people_q1$url2[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q1$url2[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q1$gender2[i] = genderize$gender
## 		  people_q1$pr2[i] = genderize$probability
##     }
##   }
##   }
## 
## # Gender 3
##  for(i in 1:nrow(people_q1)){
##     if(is.na(people_q1$url3[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q1$url3[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q1$gender3[i] = genderize$gender
## 		  people_q1$pr3[i] = genderize$probability
##     }
##   }
##  }
## 
## # Gender 4
##  for(i in 1:nrow(people_q1)){
##     if(is.na(people_q1$url4[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q1$url4[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q1$gender4[i] = genderize$gender
## 		  people_q1$pr4[i] = genderize$probability
##     }
##   }
##  }
## 
## write_csv(people_q1, here("data", "gender", "people_q1.csv"))
## 
## # Second Quarter
## # Gender 1
## for(i in 1:nrow(people_q2)){
##     if(is.na(people_q2$url1[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q2$url1[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q2$gender1[i] = genderize$gender
## 		  people_q2$pr1[i] = genderize$probability
##     }
##   }
##   }
## 
## # Gender 2
##  for(i in 1:nrow(people_q2)){
##     if(is.na(people_q2$url2[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q2$url2[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q2$gender2[i] = genderize$gender
## 		  people_q2$pr2[i] = genderize$probability
##     }
##   }
##   }
## 
## # Gender 3
##  for(i in 1:nrow(people_q2)){
##     if(is.na(people_q2$url3[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q2$url3[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q2$gender3[i] = genderize$gender
## 		  people_q2$pr3[i] = genderize$probability
##     }
##   }
##  }
## 
## # Gender 4
##  for(i in 1:nrow(people_q2)){
##     if(is.na(people_q2$url4[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q2$url4[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q2$gender4[i] = genderize$gender
## 		  people_q2$pr4[i] = genderize$probability
##     }
##   }
##  }
## 
## write_csv(people_q2, here("data", "gender", "people_q2.csv"))
## 
## # Third Quarter
## # Gender 1
## for(i in 1:nrow(people_q3)){
##     if(is.na(people_q3$url1[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q3$url1[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q3$gender1[i] = genderize$gender
## 		  people_q3$pr1[i] = genderize$probability
##     }
##   }
##   }
## 
## # Gender 2
##  for(i in 1:nrow(people_q3)){
##     if(is.na(people_q3$url2[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q3$url2[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q3$gender2[i] = genderize$gender
## 		  people_q3$pr2[i] = genderize$probability
##     }
##   }
##   }
## 
## # Gender 3
##  for(i in 1:nrow(people_q3)){
##     if(is.na(people_q3$url3[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q3$url3[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q3$gender3[i] = genderize$gender
## 		  people_q3$pr3[i] = genderize$probability
##     }
##   }
##  }
## 
## # Gender 4
##  for(i in 1:nrow(people_q3)){
##     if(is.na(people_q3$url4[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q3$url4[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q3$gender4[i] = genderize$gender
## 		  people_q3$pr4[i] = genderize$probability
##     }
##   }
##  }
## 
## write_csv(people_q3, here("data", "gender", "people_q3.csv"))
## 
## # Fourth Quarter
## # Gender 1
## for(i in 1:nrow(people_q4)){
##     if(is.na(people_q4$url1[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q4$url1[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q4$gender1[i] = genderize$gender
## 		  people_q4$pr1[i] = genderize$probability
##     }
##   }
##   }
## 
## # Gender 2
##  for(i in 1:nrow(people_q4)){
##     if(is.na(people_q4$url2[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q4$url2[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q4$gender2[i] = genderize$gender
## 		  people_q4$pr2[i] = genderize$probability
##     }
##   }
##   }
## 
## # Gender 3
##  for(i in 1:nrow(people_q4)){
##     if(is.na(people_q4$url3[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q4$url3[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q4$gender3[i] = genderize$gender
## 		  people_q4$pr3[i] = genderize$probability
##     }
##   }
##  }
## 
## # Gender 4
##  for(i in 1:nrow(people_q4)){
##     if(is.na(people_q4$url4[i])){
##       print(i)
##     }else{
##       genderize = fromJSON(people_q4$url4[i], flatten = TRUE)
##       if(is.null(genderize$gender)){
##         print(i)
##     }else{
## 		  people_q4$gender4[i] = genderize$gender
## 		  people_q4$pr4[i] = genderize$probability
##     }
##   }
##  }
## 
## write_csv(people_q4, here("data", "gender", "people_q4.csv"))


## Merge four datasets
# The four quarter files are read with the same column specification, so it is
# written once and reused. cols_only() reads just the columns named here; any
# other column present in a file is skipped.
gender_quarter_cols <- cols_only(
  "first_name" = col_character(),
  "middle_name" = col_character(),
  "last_name" = col_character(),
  "title" = col_character(),
  "degree" = col_character(),
  "people_record_last_update_date" = col_character(),
  "position" = col_character(),
  "department" = col_character(),
  "institution" = col_character(),
  "city" = col_character(),
  "state" = col_character(),
  "zip" = col_character(),
  "country" = col_character(),
  "e_mail_address" = col_character(),
  "orcid" = col_character(),
  "people_unique_id" = col_double(),
  "firstname_clean" = col_character(),
  "middlename_clean" = col_character(),
  "country_iso2c" = col_character(),
  "gender1" = col_factor(),
  "gender2" = col_factor(),
  "gender3" = col_factor(),
  "gender4" = col_factor(),
  "pr1" = col_double(),
  "pr2" = col_double(),
  "pr3" = col_double(),
  "pr4" = col_double()
)

# Read one quarter file from data/gender/ using the shared column spec
read_gender_quarter <- function(file_name) {
  read_csv(
    here("data", "gender", file_name),
    col_types = gender_quarter_cols,
    locale = default_locale()
  )
}

people_q1 <- read_gender_quarter("people_q1.csv")
people_q2 <- read_gender_quarter("people_q2.csv")
people_q3 <- read_gender_quarter("people_q3.csv")
people_q4 <- read_gender_quarter("people_q4.csv")

# Keep a copy of the raw EM import, but only if it exists: the chunk that
# creates `people` is disabled above, so on a normal run there is nothing to
# copy and an unconditional assignment would stop the script.
if (exists("people")) {
  people_import <- people
}

# Stack the four quarters and drop fully duplicated rows
people_previous <- bind_rows(people_q1, people_q2, people_q3, people_q4)
people <- unique(people_previous)


# Cut a probability column into 0.1-wide intervals with a bin edge at 0
bin_probability <- function(p) {
  cut_width(p, width = .1, boundary = 0)
}

# One binned column per probability column (pr1..pr4 -> pr1bin..pr4bin)
people <- mutate(
  people,
  pr1bin = bin_probability(pr1),
  pr2bin = bin_probability(pr2),
  pr3bin = bin_probability(pr3),
  pr4bin = bin_probability(pr4)
)

# For each probability column: a density curve of the raw values, then a bar
# chart of the number of rows per 0.1-wide bin.
# The bar charts use geom_bar(), which counts rows per category directly;
# geom_histogram(stat = "count") draws the same bars but warns about ignored
# binning parameters.

# pr1
people %>%
  ggplot(aes(pr1)) +
  geom_density() +
  theme_minimal()

people %>%
  ggplot(aes(pr1bin)) +
  geom_bar() +
  theme_minimal()

# pr2
people %>%
  ggplot(aes(pr2)) +
  geom_density() +
  theme_minimal()

people %>%
  ggplot(aes(pr2bin)) +
  geom_bar() +
  theme_minimal()

# pr3
people %>%
  ggplot(aes(pr3)) +
  geom_density() +
  theme_minimal()

people %>%
  ggplot(aes(pr3bin)) +
  geom_bar() +
  theme_minimal()

# pr4
people %>%
  ggplot(aes(pr4)) +
  geom_density() +
  theme_minimal()

people %>%
  ggplot(aes(pr4bin)) +
  geom_bar() +
  theme_minimal()



# Make sure the four API gender columns are factors
people$gender1 <- as_factor(people$gender1)
people$gender2 <- as_factor(people$gender2)
people$gender3 <- as_factor(people$gender3)
people$gender4 <- as_factor(people$gender4)

# Create gender_api as an all-missing factor that shares gender1's levels.
# Converting a bare logical NA with as_factor() would instead give a factor
# with the levels "FALSE" and "TRUE", which then show up as empty categories
# in the tables below.
people$gender_api <- factor(rep(NA_character_, nrow(people)),
                            levels = levels(people$gender1))

# Fill gender_api step by step; each rule only touches rows that are still NA.

# Rule 1: pr1 >= .7 -> gender1
people <- people %>%
  mutate(gender_api = if_else(is.na(gender_api) & pr1 >= .7,
                              gender1, gender_api))
table(people$gender_api, exclude = NULL)

# Rule 2: pr3 >= .7 -> gender3
people <- people %>%
  mutate(gender_api = if_else(is.na(gender_api) & pr3 >= .7,
                              gender3, gender_api))
table(people$gender_api, exclude = NULL)

# Rule 3: no cleaned first name and pr2 >= .7 -> gender2
people <- people %>%
  mutate(gender_api = if_else(
    is.na(gender_api) & is.na(firstname_clean) & pr2 >= .7,
    gender2, gender_api
  ))
table(people$gender_api, exclude = NULL)

# Rule 4: no cleaned first name and pr4 >= .7 -> gender4
people <- people %>%
  mutate(gender_api = if_else(
    is.na(gender_api) & is.na(firstname_clean) & pr4 >= .7,
    gender4, gender_api
  ))
table(people$gender_api, exclude = NULL)

# gender_confid: TRUE for every row that already has a gender_api value at
# this point, NA for all other rows
people <- people %>%
  mutate(gender_confid = if_else(!is.na(gender_api), TRUE, NA))
table(people$gender_api, people$gender_confid, exclude = NULL)

# Agreement rules for rows that are still NA: accept a gender when two of the
# API answers agree (note to be the same each must have pr >= .5).
# The answers are compared as character strings because `==` between two
# factors raises an error when their level sets differ.

# gender1 and gender2 agree -> gender1
people <- people %>%
  mutate(gender_api = if_else(
    is.na(gender_api) & as.character(gender1) == as.character(gender2),
    gender1, gender_api
  ))
table(people$gender_api, exclude = NULL)

# gender3 and gender4 agree -> gender3
people <- people %>%
  mutate(gender_api = if_else(
    is.na(gender_api) & as.character(gender3) == as.character(gender4),
    gender3, gender_api
  ))
table(people$gender_api, exclude = NULL)

# gender1 and gender3 agree -> gender1
people <- people %>%
  mutate(gender_api = if_else(
    is.na(gender_api) & as.character(gender1) == as.character(gender3),
    gender1, gender_api
  ))
table(people$gender_api, exclude = NULL)

# no cleaned first name, and gender2 and gender4 agree -> gender2
people <- people %>%
  mutate(gender_api = if_else(
    is.na(gender_api) & is.na(firstname_clean) &
      as.character(gender2) == as.character(gender4),
    gender2, gender_api
  ))
table(people$gender_api, exclude = NULL)

# cross-tab of coded gender against the confidence flag, before recoding
table(people$gender_api, people$gender_confid, exclude = NULL)

# Rows that have a coded gender but still no confidence flag get FALSE
# (low confidence); rows without a coded gender keep gender_confid = NA
people <- people %>%
  mutate(gender_confid = if_else(gender_api != "" & is.na(gender_confid),
                                 FALSE, gender_confid))


# table(rows, cols)
table(people$gender_api, people$gender_confid, exclude = NULL)
table(people$gender_api, exclude = NULL)


# Code gender from the honorific in `title`:
#   "Mr."                 -> "male"
#   "Mrs.", "Ms.", "Miss" -> "female"  ("Miss" was previously coded "male")
#   anything else, including a missing title, stays NA
people$gender_title <- NA_character_
people$gender_title[people$title %in% "Mr."] <- "male"
people$gender_title[people$title %in% c("Mrs.", "Ms.", "Miss")] <- "female"

people$gender_title <- as_factor(people$gender_title)

table(people$gender_title, exclude = NULL)

table(people$gender_api, people$gender_title, exclude = NULL)

# Rows where the title says female but the API coding says male
titf_codem <- filter(people, gender_api == "male", gender_title == "female")
nrow(titf_codem)

# Rows where the title says male but the API coding says female
titfm_codef <- filter(people, gender_api == "female", gender_title == "male")
nrow(titfm_codef)


# Import the author questionnaire export (tab-delimited, no quoting).
# Only the columns named in the specification are read.
author_question_cols <- cols_only(
  "Manuscript Number" = col_character(),
  "Question Text" = col_character(),
  "Author Response" = col_character(),
  "Asked at Initial Submission" = col_double(),
  "Asked at Revision" = col_double(),
  "Author's People ID" = col_double(),
  "Revision Independent Author ID" = col_character(),
  "Author Type" = col_character(),
  "Author's Title" = col_character(),
  "Author's First Name" = col_character(),
  "Author's Middle Name" = col_character(),
  "Author's Last Name" = col_character(),
  "Author's Academic Degree" = col_character(),
  "Author's E-mail address" = col_character(),
  "Author's Country" = col_character(),
  "Author's Affiliation" = col_character(),
  "Author's Institution" = col_character(),
  "Author's Institution ID" = col_character(),
  "Author's Department" = col_character(),
  "Author's Address Line 1" = col_character(),
  "Author's Address Line 2" = col_character(),
  "Author's Address Line 3" = col_character(),
  "Author's Address Line 4" = col_character(),
  "Author's City" = col_character(),
  "Author's State/Province" = col_character(),
  "Author's Zip/Postal Code" = col_character(),
  "Author's Order" = col_double()
)

author_questions <- read_delim(
  here("data", "SUBMISSIONS_ALL_AUTHORS_QUESTIONNAIRES_VIEW_UNICODE.tab"),
  col_types = author_question_cols,
  delim = "\t",
  quote = "",
  locale = default_locale(),
  guess_max = 10000
)

# Build one self-identified gender per person from the questionnaire answers:
#   1. keep answers to the gender-identity question, dropping missing answers
#      and "Prefer not to answer"
#   2. per person, keep the first row after sorting by descending manuscript
#      number
#   3. rename to match `people`, lower-case the answer, and drop people whose
#      retained answer is "Non-binary"
temp <- author_questions %>%
  filter(`Question Text` == "What is your gender identity?") %>%
  filter(`Author Response` != "Prefer not to answer") %>%
  filter(!is.na(`Author Response`)) %>%
  arrange(`Author's People ID`, desc(`Manuscript Number`)) %>%
  group_by(`Author's People ID`) %>%
  # the row index is passed by position: current dplyr rejects named arguments
  # in slice()'s `...`, so slice(n = 1) fails there
  slice(1) %>%
  # drop the grouping so the result is a plain one-row-per-person table
  ungroup() %>%
  select(`Author's People ID`, `Author Response`) %>%
  rename(people_unique_id = `Author's People ID`,
         gender_selfid = `Author Response`) %>%
  mutate(gender_selfid = if_else(gender_selfid == "Non-binary",
                                 NA_character_, gender_selfid),
         gender_selfid = tolower(gender_selfid)) %>%
  filter(!is.na(gender_selfid))


# Attach the self-identified gender to the main table by person id and store
# it as a factor
people <- people %>%
  left_join(temp, by = "people_unique_id") %>%
  mutate(gender_selfid = as_factor(gender_selfid))

table(people$gender_selfid, exclude = NULL)
table(people$gender_api, people$gender_selfid, exclude = NULL)

# Rows self-identified as female but coded male by the API
selff_codem <- filter(people, gender_api == "male", gender_selfid == "female")
nrow(selff_codem)

# Rows self-identified as male but coded female by the API
selfm_codef <- filter(people, gender_api == "female", gender_selfid == "male")
nrow(selfm_codef)


# self_title: TRUE when the title-based and self-identified genders match,
# FALSE when they differ, NA when either one is missing.
# Compared as character strings because `==` between two factors raises an
# error when their level sets differ, and the self-id answers are not limited
# to the two title-based values.
people$self_title <- as.character(people$gender_title) ==
  as.character(people$gender_selfid)



# Summary counts; exclude = NULL keeps the NA category in every table.
# API-coded gender, its confidence flag, and the two crossed
table(people$gender_api, exclude = NULL)
table(people$gender_confid, exclude = NULL)
table(people$gender_api, people$gender_confid, exclude = NULL)

# title-based gender, self-identified gender, and the two crossed
table(people$gender_title, exclude = NULL)
table(people$gender_selfid, exclude = NULL)
table(people$gender_title, people$gender_selfid, exclude = NULL)

# confidence flag (rows) X whether title and self-id match (columns)
table(people$gender_confid, people$self_title, exclude = NULL)


# start with predicted gender based on name
people$gender_coded <- as_factor(people$gender_api)
table(people$gender_coded, exclude = NULL)

# If the API produced no gender, but the title-based and self-identified
# genders agree (self_title is TRUE), use the self-identified gender.
# `%in% TRUE` treats a missing self_title as "no match".
people <- people %>%
  mutate(gender_coded = if_else(is.na(gender_api) & self_title %in% TRUE,
                                gender_selfid, gender_coded))
table(people$gender_coded, exclude = NULL)

# Replace the API-coded gender with the self-identified gender only when
# confidence in the API gender is low (gender_confid is FALSE) and the title
# and self-id match.
# This rule used to also require is.na(gender_coded). A row with
# gender_confid == FALSE always has a coded gender, so the rule never fired.
people <- people %>%
  mutate(gender_coded = if_else(
    gender_confid %in% FALSE & self_title %in% TRUE,
    gender_selfid, gender_coded
  ))
table(people$gender_coded, exclude = NULL)


# Write the people table, with the gender columns built above, to
# data/gender/people_gender.csv
write_csv(people, here("data","gender", "people_gender.csv"))

